{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ca835f57-a1f0-4e12-8565-672a8fbc6813",
   "metadata": {},
   "source": [
    "This notebook reporoduces the training and classification for the Bag of Words classifiers in table 3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b5bc4017-0a2f-46c5-bde9-0a4a4788c1c1",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     C:\\Users\\mikeb\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\mikeb\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.metrics import matthews_corrcoef, classification_report\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "# download stopwords and punctuation dictionaries\n",
    "nltk.download('stopwords')\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b7a7bbb-c57f-4b47-894f-9918e4c3269e",
   "metadata": {},
   "source": [
    "## SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "31756313-75d0-4ff2-92ce-34f1b163896d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load training data\n",
    "df = pd.read_csv('./trump_train_data.csv')\n",
    "# Create a TF-IDF vectorizer\n",
    "vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words = 'english')\n",
    "corpus = vectorizer.fit_transform(df['text'])\n",
    "# split the training data for the grid search\n",
    "X_train, X_test, y_train, y_test = train_test_split(corpus, df['stance_sup'], test_size=0.2, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "38fd0b8d-d688-4c5b-8d70-23dbb8611802",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.80      0.92      0.85       341\n",
      "         1.0       0.81      0.57      0.67       189\n",
      "\n",
      "    accuracy                           0.80       530\n",
      "   macro avg       0.80      0.75      0.76       530\n",
      "weighted avg       0.80      0.80      0.79       530\n",
      "\n",
      "0.5457241297427546\n",
      "Best Hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}\n",
      "CPU times: total: 38 s\n",
      "Wall time: 38.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Define a parameter grid for GridSearchCV\n",
    "param_grid = {\n",
    "    'C': [0.1, 1, 10],\n",
    "    'kernel': ['linear', 'rbf'],\n",
    "    'gamma': [ 0.1, 1, 2, 10]\n",
    "}\n",
    "\n",
    "# Create an SVM classifier\n",
    "svm = SVC(random_state=1)\n",
    "\n",
    "# Create a GridSearchCV object\n",
    "grid_search = GridSearchCV(svm, param_grid, scoring='roc_auc')\n",
    "\n",
    "# Perform grid search and return the best classifier\n",
    "grid_search.fit(X_train, y_train)\n",
    "best_classifier = grid_search.best_estimator_\n",
    "\n",
    "# Make predictions on the test data using the best classifier\n",
    "y_pred = best_classifier.predict(X_test)\n",
    "\n",
    "# Evaluate the classifier on the validation set\n",
    "mcc = matthews_corrcoef(y_test, y_pred)\n",
    "report = classification_report(y_test, y_pred)\n",
    "print(report)\n",
    "print(mcc)\n",
    "print(\"Best Hyperparameters:\", grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ab1b66cd-9888-4fc2-b7b9-c6d7809a62a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4940052598070652\n"
     ]
    }
   ],
   "source": [
    "# Import testing set\n",
    "testdocs = pd.read_csv('./trump_twitter_supervised.csv')\n",
    "# Vectorize test set\n",
    "testcorp = vectorizer.transform(testdocs['text'])\n",
    "# Predict and print results\n",
    "test_pred = best_classifier.predict(testcorp)\n",
    "print(matthews_corrcoef(testdocs['labels'], test_pred))\n",
    "\n",
    "# Add results to dataframe\n",
    "testdocs['svm'] = test_pred\n",
    "# Export results\n",
    "testdocs.to_csv('./trump_twitter_supervised.csv', index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1ed61e5-4ead-49c5-9059-c40def98f997",
   "metadata": {},
   "source": [
    "## Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "971f476f-f5c9-4824-9744-294043cd204b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.75      0.96      0.84       341\n",
      "         1.0       0.86      0.43      0.57       189\n",
      "\n",
      "    accuracy                           0.77       530\n",
      "   macro avg       0.81      0.70      0.71       530\n",
      "weighted avg       0.79      0.77      0.75       530\n",
      "\n",
      "0.48962593455055947\n",
      "Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}\n",
      "CPU times: total: 14.1 s\n",
      "Wall time: 2min 5s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "param_grid = {\n",
    "    'n_estimators': [100, 200, 300],\n",
    "    'max_depth': [None, 10, 20, 30],\n",
    "    'min_samples_split': [2, 5, 10],\n",
    "    'min_samples_leaf': [1, 2, 4],\n",
    "}\n",
    "\n",
    "# Create a Random Forest classifier\n",
    "rf_classifier = RandomForestClassifier(random_state=1)\n",
    "\n",
    "# Create a GridSearchCV object\n",
    "grid_search = GridSearchCV(rf_classifier, param_grid, scoring='roc_auc', n_jobs=-1)\n",
    "\n",
    "# Perform grid search and return the best classifier\n",
    "grid_search.fit(X_train, y_train)\n",
    "best_classifier = grid_search.best_estimator_\n",
    "\n",
    "# Make predictions on the test data using the best classifier\n",
    "y_pred = best_classifier.predict(X_test)\n",
    "\n",
    "# Evaluate the classifier\n",
    "mcc = matthews_corrcoef(y_test, y_pred)\n",
    "report = classification_report(y_test, y_pred)\n",
    "print(report)\n",
    "print(mcc)\n",
    "print(\"Best Hyperparameters:\", grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8b909ad8-3c3d-4edf-a372-376d3a87c359",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.48939628851804473\n"
     ]
    }
   ],
   "source": [
    "# Import testing set\n",
    "testdocs = pd.read_csv('./trump_twitter_supervised.csv')\n",
    "# Vectorize testing docs\n",
    "testcorp = vectorizer.transform(testdocs['text'])\n",
    "# Predict and print results\n",
    "test_pred = best_classifier.predict(testcorp)\n",
    "print(matthews_corrcoef(testdocs['labels'], test_pred))\n",
    "\n",
    "# Add results to dataframe\n",
    "testdocs['forest'] = test_pred\n",
    "# Export results\n",
    "testdocs.to_csv('./trump_twitter_supervised.csv', index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39a19b6b-faa4-478f-8c96-0174f90b69eb",
   "metadata": {},
   "source": [
    "## Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "101e2c61-1618-45c9-8397-62d162ee0ada",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.77      0.96      0.85       341\n",
      "         1.0       0.87      0.48      0.61       189\n",
      "\n",
      "    accuracy                           0.79       530\n",
      "   macro avg       0.82      0.72      0.73       530\n",
      "weighted avg       0.80      0.79      0.77       530\n",
      "\n",
      "0.52482002507182\n",
      "Best Hyperparameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}\n",
      "CPU times: total: 469 ms\n",
      "Wall time: 1.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Define a parameter grid for GridSearchCV for Logistic Regression\n",
    "param_grid = {\n",
    "    'C': [0.001, 0.01, 0.1, 1, 10],\n",
    "    'penalty': ['l1', 'l2'],\n",
    "    'solver': ['liblinear', 'saga'],\n",
    "}\n",
    "\n",
    "# Create a Logistic Regression classifier\n",
    "lr_classifier = LogisticRegression()\n",
    "\n",
    "# Create a GridSearchCV object\n",
    "grid_search = GridSearchCV(lr_classifier, param_grid, scoring='roc_auc', n_jobs=-1)\n",
    "\n",
    "# Perform grid search and train the best classifier\n",
    "grid_search.fit(X_train, y_train)\n",
    "best_classifier = grid_search.best_estimator_\n",
    "\n",
    "# Make predictions on the test data using the best classifier\n",
    "y_pred = best_classifier.predict(X_test)\n",
    "\n",
    "# Evaluate the classifier\n",
    "mcc = matthews_corrcoef(y_test, y_pred)\n",
    "report = classification_report(y_test, y_pred)\n",
    "print(report)\n",
    "print(mcc)\n",
    "print(\"Best Hyperparameters:\", grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fbce341e-e870-4326-aa7b-66f8b7dbd87d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5104852878800066\n"
     ]
    }
   ],
   "source": [
    "# Import testing set\n",
    "testdocs = pd.read_csv('./trump_twitter_supervised.csv')\n",
    "# Vectorize testing docs\n",
    "testcorp = vectorizer.transform(testdocs['text'])\n",
    "# Predict and print results\n",
    "test_pred = best_classifier.predict(testcorp)\n",
    "print(matthews_corrcoef(testdocs['labels'], test_pred))\n",
    "\n",
    "# Add results to dataframe\n",
    "testdocs['logistic'] = test_pred\n",
    "# Export results\n",
    "testdocs.to_csv('./trump_twitter_supervised.csv', index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
